InĀ [1]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import scvi
import os

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 50

DPI=300
FONTSIZE=20 #42

sc.settings.set_figure_params(scanpy = True, dpi=100, transparent=True, vector_friendly = True, dpi_save=DPI) 
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42
Global seed set to 0
InĀ [2]:
os.chdir('/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis')

DIR2SAVE = Path('/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis')

FIG2SAVE = DIR2SAVE.joinpath('Figures/')
FIG2SAVE
# set the global variable: sc.settings.figdir to save all plots 
sc.settings.figdir = FIG2SAVE
InĀ [3]:
adata = sc.read_mtx('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated.mtx').T
adata
Out[3]:
AnnData object with n_obs Ɨ n_vars = 23119 Ɨ 36485
InĀ [4]:
features = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated_genes.csv')
adata.var_names = features['x'].str.split('_', expand=True)[0]
 
barcodes = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated_barcodes.csv')
adata.obs_names = barcodes['x']
 
meta = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated_metadata.csv',
                   index_col=0)
meta = meta.drop(['nCount_ATAC', 'nFeature_ATAC'], axis=1) #drop Signac ATAC QC metrics - use metrics from ArchR
adata.obs = meta

adata
Out[4]:
AnnData object with n_obs Ɨ n_vars = 23119 Ɨ 36485
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5'
InĀ [5]:
X_CCA = pd.read_csv('/data/BCI-CRC/SO/data/CRC_multiome/ArchR_final_analysis/Matrices/GEX_decontaminated.CCA.csv', index_col=0)
X_CCA = X_CCA.to_numpy()
print(f"X_CCA shape: {np.shape(X_CCA)}")
adata.obsm['X_CCA'] = X_CCA
print(f"X_CCA: {adata.obsm['X_CCA']}")
X_CCA shape: (23119, 2)
X_CCA: [[-12.20550625   3.08438743]
 [  4.90263804  -2.15925419]
 [-11.58478538   3.74730027]
 ...
 [ -4.40180819 -13.12077558]
 [  3.6492203   -1.49928909]
 [ -5.45846741 -13.2389425 ]]
InĀ [6]:
adata.obs.columns
Out[6]:
Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient',
       'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt',
       'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters',
       'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon',
       'Cell_type_preDecon', 'ident', 'decontX_contamination',
       'decontX_clusters', 'integratedRNADecon_snn_res.0.5'],
      dtype='object')
InĀ [7]:
sc.pl.embedding(adata, 'X_CCA', color=['ELF3','COL3A1','CP','SLC11A1','CD79A','TRAC'], use_raw=False, ncols=6, vmax='p99', vmin=1, color_map='plasma_r')
No description has been provided for this image
InĀ [8]:
genes=['EPCAM','ELF3','HNF4A',
          'SLC11A1','CD68','LYZ',
          'CD79A','BANK1','BLK',
          'CD3E','PTPRC','TRAC',
          'ALB','CP','DEFB1',
          'PECAM1','ARL15','VWF',
          'DCN','ACTA2','COL3A1',
          'PROX1']
sc.pl.embedding(adata, 'X_CCA', color=genes, use_raw=False, ncols=6, vmax='p99', vmin=1, color_map='plasma_r')
No description has been provided for this image
InĀ [9]:
adata.obs.Cell_type_preDecon.value_counts()
Out[9]:
Epithelial     17758
Myeloid         1900
T/NK/ILC        1600
Fibroblast       675
Endothelial      496
Hepatocytes      490
B                200
Name: Cell_type_preDecon, dtype: int64
InĀ [10]:
adata.obs.Sample.value_counts()
Out[10]:
CRC08_LM    4319
CRC09_LM    3268
CRC04_LM    2960
CRC10_LM    1980
CRC14_LM    1845
CRC15_LM    1408
CRC06_LM    1393
CRC11_LM    1373
CRC02_LM     975
CRC03_LM     790
CRC12_LM     735
CRC05_LM     708
CRC13_LM     533
CRC07_LM     464
CRC01_LM     368
Name: Sample, dtype: int64
InĀ [11]:
# keep raw decontaminated counts
adata.layers["raw"] = adata.X.copy() # preserve counts
# normalize + log1p 
sc.pp.normalize_total(adata, target_sum=1e4, inplace=True)
adata.layers["normalised"] = adata.X.copy()
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()
adata.raw = adata # keep normalised log1p as .raw
InĀ [12]:
sc.pp.highly_variable_genes(adata, 
                            subset=True, # subset for integration (but full lognorm data in .raw)
                            layer='raw', 
                            flavor='seurat_v3', 
                            n_top_genes=2000, 
                            span=0.3, 
                            min_disp=0.5, 
                            min_mean=0.0125, 
                            max_mean=3,
                            batch_key='Sample'
                           )
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scanpy/preprocessing/_highly_variable_genes.py:62: UserWarning: `flavor='seurat_v3'` expects raw count data, but non-integers were found.
  warnings.warn(
InĀ [13]:
print(adata.X.shape)
print(adata.raw.to_adata().X.shape)
(23119, 2000)
(23119, 36485)
InĀ [14]:
for i in ['nCount_RNA', 'nFeature_RNA','percent.mt','percent.ribo', 'TSSEnrichment', 'nFrags', 'decontX_contamination']:
    sc.pl.violin(adata, i, jitter=False, groupby='Sample', save='_'+i+'.pdf', rotation=90)
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_nCount_RNA.pdf
No description has been provided for this image
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_nFeature_RNA.pdf
No description has been provided for this image
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_percent.mt.pdf
No description has been provided for this image
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_percent.ribo.pdf
No description has been provided for this image
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_TSSEnrichment.pdf
No description has been provided for this image
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_nFrags.pdf
No description has been provided for this image
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/violin_decontX_contamination.pdf
No description has been provided for this image
InĀ [15]:
scvi.model.SCVI.setup_anndata(
    adata,
    layer="raw",
    categorical_covariate_keys=["Sample"],
    continuous_covariate_keys=["percent.mt", "nFeature_RNA"]
)
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scvi/data/fields/_layer_field.py:78: UserWarning: adata.layers[raw] does not contain unnormalized count data. Are you sure this is what you want?
  warnings.warn(
InĀ [16]:
model = scvi.model.SCVI(adata, n_latent=10)
model
SCVI Model with the following params: 
n_hidden: 128, n_latent: 10, n_layers: 1, dropout_rate: 0.1, dispersion: gene, 
gene_likelihood: zinb, latent_distribution: normal
Training status: Not Trained
Out[16]:

InĀ [17]:
model.train()
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: Torchmetrics v0.9 introduced a new argument class property called `full_state_update` that has
                not been set for this class (ElboMetric). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_no_full_state`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                
  warnings.warn(*args, **kwargs)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Epoch 1/346:   0%|          | 0/346 [00:00<?, ?it/s]
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scvi/distributions/_negative_binomial.py:436: UserWarning: The value argument must be within the support of the distribution
  warnings.warn(
Epoch 346/346: 100%|ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ| 346/346 [12:37<00:00,  2.19s/it, loss=776, v_num=1]
InĀ [18]:
# save model 
import os
model.save(os.path.join(DIR2SAVE,'scvi_model_hvg_mt'))
InĀ [19]:
# read the model
model = scvi.model.SCVI.load(os.path.join(DIR2SAVE,'scvi_model_hvg_mt'), adata=adata)#, use_gpu=True)
INFO     File /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/scvi_model_hvg_mt
         /model.pt already downloaded                                                        
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/scvi/data/fields/_layer_field.py:78: UserWarning: adata.layers[raw] does not contain unnormalized count data. Are you sure this is what you want?
  warnings.warn(
InĀ [20]:
latent = model.get_latent_representation() # get model output 
InĀ [21]:
print(latent.shape)
# It’s often useful to store the outputs of scvi-tools back into the original anndata, as it permits interoperability with 
# Scanpy.
adata.obsm["X_scVI"] = latent

# Let’s store the normalized values back in the anndata.
adata.layers["scvi_normalized"] = model.get_normalized_expression(adata=adata, library_size=1e4)

print("Number of PCs:", latent.shape[1])
# use scVI latent space for UMAP generation
# set nb pcs to latent representation shape of scvi
# compute neighbourhood graph
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=latent.shape[1], knn=True, method='umap', metric='euclidean',
                use_rep="X_scVI", random_state=7)

# compute UMAP embedding 
sc.tl.umap(adata, min_dist=0.3, n_components=2, random_state=7)
(23119, 10)
Number of PCs: 10
InĀ [22]:
sc.pl.umap(adata, color='Sample')
adata.obs['Cell_type_preDecon'] = adata.obs['Cell_type_preDecon'].astype("category")
sc.pl.umap(adata, color='Cell_type_preDecon')
No description has been provided for this image
No description has been provided for this image
InĀ [23]:
sc.pl.umap(adata, color=['nCount_RNA', 'nFeature_RNA','percent.mt','percent.ribo', 'TSSEnrichment', 'nFrags',
                        'decontX_contamination'],
           ncols=4,color_map='viridis_r', vmax='p99',
          save='_allCells_scVI_QC.pdf')
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/umap_allCells_scVI_QC.pdf
No description has been provided for this image
InĀ [24]:
genes=['EPCAM','ELF3','HNF4A',
          'SLC11A1','CD68','LYZ',
          'CD79A','BANK1','BLK',
          'CD3E','PTPRC','TRAC',
          'ALB','CP','DEFB1',
          'PECAM1','ARL15','VWF',
          'DCN','ACTA2','COL3A1',
          'PROX1']
sc.pl.umap(adata, color=genes, use_raw=True, ncols=6, vmax='p99', vmin=1, color_map='plasma_r',
          save='_allCells_scVI_markers.pdf')
WARNING: saving figure to file /data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Figures/umap_allCells_scVI_markers.pdf
No description has been provided for this image
InĀ [25]:
sc.tl.leiden(adata, resolution=0.5)
sc.pl.umap(adata, color='leiden')
sc.pl.umap(adata, color='leiden', legend_loc = 'on data')
No description has been provided for this image
No description has been provided for this image
InĀ [26]:
### Find marker genes
#del adata.uns['log1p']
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon',pts=True, use_raw=True)
sc.pl.rank_genes_groups(adata, n_genes=15, sharey=False)
No description has been provided for this image
InĀ [27]:
pval_thresh = 0.05
log2fc_thresh = 0.25
pct_cutoff = 0.1
cluster_de_genes = dict()
for cluster in sorted(set(adata.obs['leiden'])):
    cluster_de_genes[cluster] = sc.get.rank_genes_groups_df(adata,
                                                            group=cluster, 
                                                            key='rank_genes_groups', 
                                                            pval_cutoff=pval_thresh, 
                                                            log2fc_min=log2fc_thresh, 
                                                            log2fc_max=None).sort_values('logfoldchanges', 
                                                                                         ascending=False)
    cluster_de_genes[cluster] = cluster_de_genes[cluster][cluster_de_genes[cluster]['pct_nz_group'] > pct_cutoff]

with pd.ExcelWriter('allCells_leiden_wilcoxon.xls') as writer:  
    for cluster in list(cluster_de_genes.keys()):      
        cluster_de_genes[cluster].to_excel(writer, sheet_name='cluster{}'.format(cluster))
/data/BCI-CRC/SO/anaconda/envs/cell2loc19/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3338: FutureWarning: As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead. You can set the option io.excel.xls.writer to 'xlwt' to silence this warning. While this option is deprecated and will also raise a warning, it can be globally set and the warning suppressed.
  if await self.run_code(code, result, async_=asy):
InĀ [28]:
adata.write('AllCells_scVI.h5ad')
InĀ [3]:
adata = sc.read('AllCells_scVI.h5ad')
adata
Out[3]:
AnnData object with n_obs Ɨ n_vars = 23119 Ɨ 2000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5', '_scvi_batch', '_scvi_labels', 'leiden'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Cell_type_preDecon_colors', 'Sample_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'rank_genes_groups', 'umap'
    obsm: 'X_CCA', 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    layers: 'log1p', 'normalised', 'raw', 'scvi_normalized'
    obsp: 'connectivities', 'distances'
InĀ [5]:
old_to_new = {'0':'Epithelial',
              '1':'Epithelial',
              '2':'Epithelial',
              '3':'Epithelial',
              '4':'Myeloid',
              '5':'Epithelial',
              '6':'T/NK/ILC',
              '7':'Fibroblast',
              '8':'Hepatocyte',
              '9':'Endothelial',
              '10':'Epithelial',
              '11':'B',
              '12':'Endothelial'    
}

adata.obs['Cell_type'] = (adata.obs['leiden'].map(old_to_new).astype('category'))

sc.pl.umap(adata, color=['Cell_type','Cell_type_preDecon'])
No description has been provided for this image
InĀ [6]:
adata.write('CRCLM_decon_scvi.h5ad')
np.savetxt('CRCLM_decon_scvi_XscVI.csv', adata.obsm['X_scVI'], delimiter=',')
np.savetxt('CRCLM_decon_scvi_Xumap.csv', adata.obsm['X_umap'], delimiter=',')
adata.obs.to_csv('CRCLM_decon_scvi_metadata.csv')